{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5cd3b160-b6fc-4240-bccd-1f0ffe1a77ee",
   "metadata": {},
   "source": [
    "# 第六节、数据清洗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "76f39886-f41c-4e87-b0dd-ea5e35257ffc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c945e77c-e686-48bb-aaf2-45cf62d1329e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {\n",
    "    'color': ['red','blue','red','green','blue',None,'red'],\n",
    "    'price': [10, 20, 10, 15, 20, 0, np.NaN],\n",
    "}\n",
    "\n",
    "df = pd.DataFrame(\n",
    "    data=data\n",
    ")\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7057566a-919f-469d-9b3a-55b8997d40b5",
   "metadata": {},
   "source": [
    "## （1）重复值处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c5fc7a33-b551-45b2-9afb-fb263ff9553f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2     True\n",
       "3    False\n",
       "4     True\n",
       "5    False\n",
       "6    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 判断是否存在重复数据\n",
    "df.duplicated()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0043b907-832e-4c5f-99b2-2d014f5c6d30",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "3  green   15.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 删除重复数据\n",
    "df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "de188ce5-ff3a-4af7-a3d7-877403e1ea50",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 如果不给定inplace=True，那么原来的数据并不会被改变\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7de09dde-2f30-47e4-a04c-caf9d378936e",
   "metadata": {},
   "source": [
    "## （2）缺失值处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "de0b847f-8061-4360-bc38-214488e9e331",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color  price\n",
       "0  False  False\n",
       "1  False  False\n",
       "2  False  False\n",
       "3  False  False\n",
       "4  False  False\n",
       "5   True  False\n",
       "6  False   True"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 空数据过滤\n",
    "# 判断是否存在空数据\n",
    "df.isnull()    # None 和 np.NaN 都会被认作空数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "3742e52f-7cf4-40c1-b264-f84e098f5fc5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 删除空数据\n",
    "df.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "087081d7-1b14-4d72-b98e-1373be477b24",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 删除空数据，指定 how 参数告诉Pandas如何删除\n",
    "df.dropna(how='any')  # 只有一行中有一列是空，就删掉一整行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e49e0f8a-c69b-4158-adbf-a95b4c913e93",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5      0    0.0\n",
       "6    red    0.0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 填充空数据\n",
    "# 上面直接删除的做法显然太过于极端，如果数据本就不多，这样的删除操作会带来样本的大量减少\n",
    "# 所以填充空数据也是一种处理方法\n",
    "df.fillna(value=0)   # 缺失的地方用value来填补"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fb4035c4-7b57-4639-8bae-fdf484f64c14",
   "metadata": {},
   "source": [
    "## （3）指定行或者列过滤"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c525ecc7-c1e0-487f-8a44-546da8a5221a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "134ceaea-6371-413e-8c8a-e19e812d043f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 直接删除某列\n",
    "# 方式一\n",
    "del df['color']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "4f8f0098-460c-444f-a0a2-9723963b33c5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   price\n",
       "0   10.0\n",
       "1   20.0\n",
       "2   10.0\n",
       "3   15.0\n",
       "4   20.0\n",
       "5    0.0\n",
       "6    NaN"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df  # del关键字来删除，直接改变了原始的df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "0b0ed490-ff1f-4272-abd2-ccfd041d7b7f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 把数据先恢复一下\n",
    "df = pd.DataFrame(\n",
    "    data=data\n",
    ")\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "8183ef00-79e0-4e71-b2ae-33a0f5dd8d25",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color\n",
       "0    red\n",
       "1   blue\n",
       "2    red\n",
       "3  green\n",
       "4   blue\n",
       "5   None\n",
       "6    red"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 指定要删除的列\n",
    "df.drop(labels=['price'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "a3e4c1da-f46a-4f7a-b5ab-9062890666de",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df  # 使用drop来删除，是不会直接修改原df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "cd90afb3-c764-438b-9b72-8d37a05ab1f3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   color  price\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "6    red    NaN"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 删除指定的行\n",
    "df.drop(labels=[0, 1, 5], axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1eff1a15-036d-4c7a-aa10-a1dc5c7baf29",
   "metadata": {},
   "source": [
    "## （4）函数filter的使用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "de52f7c6-01d0-430f-be8c-8b0fb697845b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>China</th>\n",
       "      <th>America</th>\n",
       "      <th>France</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dog</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat</th>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "      <td>256</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     China  America  France\n",
       "dog      3        7       1\n",
       "cat      2        8     256"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(\n",
    "    data=np.array([[3, 7, 1], [2, 8, 256]]),\n",
    "    index=['dog', 'cat'],\n",
    "    columns=['China', 'America', 'France']\n",
    ")\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "05acc9dd-00f2-47e8-8a62-e80c7bf0db90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>China</th>\n",
       "      <th>France</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dog</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat</th>\n",
       "      <td>2</td>\n",
       "      <td>256</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     China  France\n",
       "dog      3       1\n",
       "cat      2     256"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 这都有点不像是清洗数据，这应该是过滤查询找出想要的数据\n",
    "df.filter(items=['China', 'France'])   # items是指定待查询的列名"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "07ad8677-b970-4d6e-8cb3-e227de6cbaca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>China</th>\n",
       "      <th>America</th>\n",
       "      <th>France</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dog</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat</th>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "      <td>256</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     China  America  France\n",
       "dog      3        7       1\n",
       "cat      2        8     256"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df  # filter函数不会改变原df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "fdb7f444-8c3f-46f9-af0a-926e4465c751",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>China</th>\n",
       "      <th>America</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dog</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat</th>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     China  America\n",
       "dog      3        7\n",
       "cat      2        8"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 根据正则表达式删除列标签\n",
    "df.filter(regex='a$', axis=1)   # a结尾的列，axis=1表示列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "398a5aec-0641-4083-8bc0-7f6caa634fdd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>China</th>\n",
       "      <th>America</th>\n",
       "      <th>France</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dog</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     China  America  France\n",
       "dog      3        7       1"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 选择行中包含og的行记录\n",
    "df.filter(like='og', axis=0)   # Like是模糊查询的意思，axis=0表示在行中操作"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6d4e93d-4d45-4f6f-be30-712931e7b1a2",
   "metadata": {},
   "source": [
    "## （5）异常值处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "e3e79de7-f94a-4ce5-90d4-4f926529c543",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.470955</td>\n",
       "      <td>-0.587543</td>\n",
       "      <td>1.621444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.521050</td>\n",
       "      <td>-0.720307</td>\n",
       "      <td>0.811096</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.847266</td>\n",
       "      <td>-0.610548</td>\n",
       "      <td>-0.481227</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.677451</td>\n",
       "      <td>-2.703948</td>\n",
       "      <td>2.050491</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.177101</td>\n",
       "      <td>-0.305142</td>\n",
       "      <td>-0.219392</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>-1.152157</td>\n",
       "      <td>0.648115</td>\n",
       "      <td>0.919199</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>0.821294</td>\n",
       "      <td>0.049589</td>\n",
       "      <td>-0.182751</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>-0.922348</td>\n",
       "      <td>-1.016397</td>\n",
       "      <td>1.190292</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>-0.668801</td>\n",
       "      <td>-0.334169</td>\n",
       "      <td>1.147863</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>-1.039376</td>\n",
       "      <td>1.040683</td>\n",
       "      <td>-0.465992</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             0         1         2\n",
       "0    -0.470955 -0.587543  1.621444\n",
       "1     0.521050 -0.720307  0.811096\n",
       "2     1.847266 -0.610548 -0.481227\n",
       "3    -0.677451 -2.703948  2.050491\n",
       "4     1.177101 -0.305142 -0.219392\n",
       "...        ...       ...       ...\n",
       "9995 -1.152157  0.648115  0.919199\n",
       "9996  0.821294  0.049589 -0.182751\n",
       "9997 -0.922348 -1.016397  1.190292\n",
       "9998 -0.668801 -0.334169  1.147863\n",
       "9999 -1.039376  1.040683 -0.465992\n",
       "\n",
       "[10000 rows x 3 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2 = pd.DataFrame(\n",
    "    data=np.random.randn(10000, 3)   # 生成标准正态分布数据\n",
    ")\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "4c1e5128-45db-49f3-9a0e-5afd3602f4f8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.002837\n",
       "1    1.000904\n",
       "2    0.997103\n",
       "dtype: float64"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 使用3σ原则，来定义异常值\n",
    "sigma = df2.std()\n",
    "sigma"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f5255c67-7591-45bf-ae66-2634a2a5ea1f",
   "metadata": {},
   "source": [
    "定义异常值x：$-3\\sigma < x < 3\\sigma$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "9c4dfa7e-dd78-403a-9e61-2422dd528f94",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          0      1      2\n",
       "0     False  False  False\n",
       "1     False  False  False\n",
       "2     False  False  False\n",
       "3     False  False  False\n",
       "4     False  False  False\n",
       "...     ...    ...    ...\n",
       "9995  False  False  False\n",
       "9996  False  False  False\n",
       "9997  False  False  False\n",
       "9998  False  False  False\n",
       "9999  False  False  False\n",
       "\n",
       "[10000 rows x 3 columns]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.abs(df2) > 3*sigma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "ce8e3ea4-387d-4622-9ad3-87df3948e23d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       False\n",
       "1       False\n",
       "2       False\n",
       "3       False\n",
       "4       False\n",
       "        ...  \n",
       "9995    False\n",
       "9996    False\n",
       "9997    False\n",
       "9998    False\n",
       "9999    False\n",
       "Length: 10000, dtype: bool"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 只要有一个是True那就是True，\n",
    "# 意思就是这一行数据，只要有一列出现了异常值，那么这一行数据都算是异常数据。\n",
    "cond = (np.abs(df2) > 3*sigma).any(axis=1) \n",
    "cond"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "06d645bc-3ae1-443c-becd-bf683cdd582a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>0.070803</td>\n",
       "      <td>-0.024811</td>\n",
       "      <td>3.024191</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>78</th>\n",
       "      <td>1.636339</td>\n",
       "      <td>0.442549</td>\n",
       "      <td>-3.307476</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>93</th>\n",
       "      <td>0.394037</td>\n",
       "      <td>-3.304580</td>\n",
       "      <td>0.224551</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>-3.022800</td>\n",
       "      <td>0.372509</td>\n",
       "      <td>-0.869801</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>175</th>\n",
       "      <td>3.100991</td>\n",
       "      <td>0.135780</td>\n",
       "      <td>0.058671</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9280</th>\n",
       "      <td>3.332708</td>\n",
       "      <td>1.434332</td>\n",
       "      <td>-1.187383</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9343</th>\n",
       "      <td>-0.168824</td>\n",
       "      <td>0.778089</td>\n",
       "      <td>-3.465733</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9481</th>\n",
       "      <td>-3.677666</td>\n",
       "      <td>0.795599</td>\n",
       "      <td>0.497770</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9615</th>\n",
       "      <td>-1.115811</td>\n",
       "      <td>-3.322262</td>\n",
       "      <td>0.137059</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9917</th>\n",
       "      <td>0.026513</td>\n",
       "      <td>-3.399994</td>\n",
       "      <td>-0.938862</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>84 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             0         1         2\n",
       "25    0.070803 -0.024811  3.024191\n",
       "78    1.636339  0.442549 -3.307476\n",
       "93    0.394037 -3.304580  0.224551\n",
       "110  -3.022800  0.372509 -0.869801\n",
       "175   3.100991  0.135780  0.058671\n",
       "...        ...       ...       ...\n",
       "9280  3.332708  1.434332 -1.187383\n",
       "9343 -0.168824  0.778089 -3.465733\n",
       "9481 -3.677666  0.795599  0.497770\n",
       "9615 -1.115811 -3.322262  0.137059\n",
       "9917  0.026513 -3.399994 -0.938862\n",
       "\n",
       "[84 rows x 3 columns]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 找到异常值\n",
    "df2[cond]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "86fd7309-293c-4868-bb5c-efa423da79ce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index([  25,   78,   93,  110,  175,  609,  986, 1044, 1379, 1501, 1521, 1599,\n",
       "       1847, 2048, 2084, 2201, 2217, 2349, 2595, 2640, 2773, 2786, 3011, 3050,\n",
       "       3080, 3184, 3225, 3262, 3342, 3405, 3441, 3504, 3567, 3674, 3694, 3960,\n",
       "       4037, 4140, 4195, 4249, 4314, 4445, 4641, 4655, 4762, 4854, 4863, 4868,\n",
       "       4963, 5289, 5584, 5623, 5624, 5875, 6506, 6597, 6775, 6837, 6920, 7036,\n",
       "       7072, 7113, 7233, 7248, 7486, 7596, 7745, 8022, 8054, 8482, 8588, 8712,\n",
       "       8784, 8793, 8842, 8910, 9021, 9054, 9261, 9280, 9343, 9481, 9615, 9917],\n",
       "      dtype='int64')"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 异常值的索引\n",
    "index = df2[cond].index\n",
    "index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "5e530f81-cc41-464b-aba5-d0f1b4ef01ca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.470955</td>\n",
       "      <td>-0.587543</td>\n",
       "      <td>1.621444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.521050</td>\n",
       "      <td>-0.720307</td>\n",
       "      <td>0.811096</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.847266</td>\n",
       "      <td>-0.610548</td>\n",
       "      <td>-0.481227</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.677451</td>\n",
       "      <td>-2.703948</td>\n",
       "      <td>2.050491</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.177101</td>\n",
       "      <td>-0.305142</td>\n",
       "      <td>-0.219392</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>-1.152157</td>\n",
       "      <td>0.648115</td>\n",
       "      <td>0.919199</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>0.821294</td>\n",
       "      <td>0.049589</td>\n",
       "      <td>-0.182751</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>-0.922348</td>\n",
       "      <td>-1.016397</td>\n",
       "      <td>1.190292</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>-0.668801</td>\n",
       "      <td>-0.334169</td>\n",
       "      <td>1.147863</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>-1.039376</td>\n",
       "      <td>1.040683</td>\n",
       "      <td>-0.465992</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>9916 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             0         1         2\n",
       "0    -0.470955 -0.587543  1.621444\n",
       "1     0.521050 -0.720307  0.811096\n",
       "2     1.847266 -0.610548 -0.481227\n",
       "3    -0.677451 -2.703948  2.050491\n",
       "4     1.177101 -0.305142 -0.219392\n",
       "...        ...       ...       ...\n",
       "9995 -1.152157  0.648115  0.919199\n",
       "9996  0.821294  0.049589 -0.182751\n",
       "9997 -0.922348 -1.016397  1.190292\n",
       "9998 -0.668801 -0.334169  1.147863\n",
       "9999 -1.039376  1.040683 -0.465992\n",
       "\n",
       "[9916 rows x 3 columns]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 依据行索引删除异常值\n",
    "df2.drop(index=index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c833a091-35c0-4bb5-bdb5-8d9e861f1118",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
